import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
# Use Plotly's "notebook" renderer as the default for every figure shown below.
pio.renderers.default = "notebook"
# Load the corpus with an explicit dtype for every column. Declaring the types
# up front (instead of letting pandas infer them) is meant to keep RAM usage
# down. The columns are grouped by dtype below and merged into one mapping.
_STR_COLUMNS = (
    "flow_id",
    "src_ip",
    "dst_ip",
    "ts",
    "simillar_http",
    "label",
)
_INT_COLUMNS = (
    "index",
    "src_port",
    "dst_port",
    "protocol",
    "flow_duration",
    "total_fwd_packets",
    "total_backward_packets",
    "total_length_fwd_packets",
    "total_length_bwd_packets",
    "fwd_packet_length_max",
    "fwd_packet_length_min",
    "bwd_packet_length_max",
    "bwd_packet_length_min",
    "flow_iat_max",
    "flow_iat_min",
    "flow_iat_total",
    "fwd_iat_max",
    "fwd_iat_min",
    "bwd_iat_total",
    "fwd_psh_flags",
    "bwd_psh_flags",
    "fwd_urg_flags",
    "bwd_urg_flags",
    "fwd_header_length",
    "bwd_header_length",
    "min_packet_length",
    "max_packet_length",
    "fin_flag_count",
    "syn_flag_count",
    "rst_flag_count",
    "psh_flag_count",
    "ack_flag_count",
    "urg_flag_count",
    "cwe_flag_count",
    "ece_flag_count",
    "down_up_ratio",
    "active_max",
    "active_min",
    "idle_max",
    "idle_min",
    "inbound",
)
_FLOAT_COLUMNS = (
    "fwd_packet_length_mean",
    "fwd_packet_length_std",
    "bwd_packet_length_mean",
    "bwd_packet_length_std",
    "flow_Bps",
    "flow_pps",
    "flow_iat_mean",
    "flow_iat_std",
    "fwd_iat_mean",
    "fwd_iat_std",
    "bwd_iat_mean",
    "bwd_iat_std",
    "bwd_iat_max",
    "bwd_iat_min",
    "fwd_pps",
    "bwd_pps",
    "packet_length_mean",
    "packet_length_std",
    "packet_length_variance",
    "avg_packet_size",
    "avg_fwd_segment_size",
    "avg_bwd_segment_size",
    "fwd_header_length_1",
    "fwd_avg_bytes_bulk",
    "fwd_avg_packets_bulk",
    "fwd_avg_bulk_rate",
    "bwd_avg_bytes_bulk",
    "bwd_avg_packets_bulk",
    "bwd_avg_bulk_rate",
    "subflow_fwd_packets",
    "subflow_fwd_bytes",
    "subflow_bwd_packets",
    "subflow_bwd_bytes",
    "init_win_bytes_forward",
    "init_win_bytes_backward",
    "act_data_pkt_fwd",
    "min_seg_size_forward",
    "active_mean",
    "active_std",
    "idle_mean",
    "idle_std",
)
_COLUMN_DTYPES = {
    **dict.fromkeys(_STR_COLUMNS, str),
    **dict.fromkeys(_INT_COLUMNS, int),
    **dict.fromkeys(_FLOAT_COLUMNS, float),
}
df = pd.read_csv("data/scrubbed_syn_udp_ben.csv", dtype=_COLUMN_DTYPES)
# Row count per label (the target variable y).
# Only the "index" column is counted: counting every column of the frame just
# to keep one of them is wasted work. The result is a two-column frame with
# "label" and "count".
df_group_count = (
    df.groupby("label")["index"]
    .count()
    .rename("count")
    .reset_index()
)
print(f"Total number of rows: {df_group_count['count'].sum()}")
# Output: Total number of rows: 413828
# Bar chart of the number of rows per label, with one color per label.
fig = px.bar(
    data_frame=df_group_count,
    x="label",
    y="count",
    color="label",
    title="Row distribution by label",
    template="plotly_white",
).update_layout(title_x=0.5)  # title_x=0.5 centers the title horizontally
fig.show()
def get_color_list(attributes):
    """Return a list of Plotly default colors, one per distinct attribute.

    Attributes are paired with ``px.colors.DEFAULT_PLOTLY_COLORS`` in order.
    The palette is repeated when there are more attributes than palette
    entries, so the result is never shorter than the number of distinct
    attributes (pairing with the bare palette would silently drop the extras).

    Duplicate attributes collapse into a single entry: the entry keeps the
    position of the first occurrence and the color paired with the last one.

    Args:
        attributes: Iterable of hashable values (e.g. a Series of labels).

    Returns:
        list: Color strings in order of first appearance of each attribute.
    """
    # Function-scope import so the module's import block stays untouched.
    from itertools import cycle

    palette = cycle(px.colors.DEFAULT_PLOTLY_COLORS)
    return list(dict(zip(attributes, palette)).values())
# Various EDA plots.

# 95th percentile of every numeric column, per label. (Despite its name,
# df_label_means holds percentiles, not means.) The numeric columns are
# selected explicitly because recent pandas versions raise a TypeError when a
# groupby quantile is asked to process string columns, instead of silently
# dropping them.
numeric_columns = list(df.select_dtypes(include="number").columns)
df_label_means = (
    df.groupby("label")[numeric_columns].quantile(0.95).reset_index()
)

# Dual-axis figure: bars for flow duration on the primary y-axis, a line for
# the total forwarded packets length on the secondary y-axis.
fig = make_subplots(rows=1, cols=1, specs=[[{"secondary_y": True}]])
fig.layout.template = "plotly_white"
fig.add_trace(
    go.Bar(
        x=df_label_means["label"],
        y=df_label_means["flow_duration"],
        name="Flow Duration 95th percentile",
        marker=dict(color=get_color_list(df_label_means["label"])),
    ),
)
fig.add_trace(
    go.Scatter(
        x=df_label_means["label"],
        y=df_label_means["total_length_fwd_packets"],
        mode="lines",
        name="Total forwarded packets length 95th percentile",
        line=dict(color="black"),
        opacity=0.5,
    ),
    secondary_y=True,
)
# Switch the y-axes to a log scale first (this is what the flow-duration bars
# on the primary axis end up using) ...
fig.update_yaxes(type="log")
# ... then put the secondary axis (total forwarded packets length) back on a
# linear scale.
fig.update_yaxes(type="linear", secondary_y=True)
fig.update_layout(
    title_x=0.5,
    title_text="95th percentile of flow duration and total forwarded packets length grouped by target variable",
)
fig.show()
# Dual-axis figure comparing the 95th percentile of sent (fwd_pps) and
# received (bwd_pps) packets per second for each label.
fig = make_subplots(rows=1, cols=1, specs=[[{"secondary_y": True}]])
fig.layout.template = "plotly_white"

# Build both traces first, then attach them to the figure.
sent_pps_bars = go.Bar(
    x=df_label_means["label"],
    y=df_label_means["fwd_pps"],
    name="95th percentile sent pps",
    marker={"color": get_color_list(df_label_means["label"])},
)
received_pps_line = go.Scatter(
    x=df_label_means["label"],
    y=df_label_means["bwd_pps"],
    mode="lines",
    name="95th percentile received pps",
    line={"color": "black"},
    opacity=0.5,
)
fig.add_trace(sent_pps_bars)
fig.add_trace(received_pps_line, secondary_y=True)

# Log scale for the y-axes, then back to linear for the secondary one only.
fig.update_yaxes(type="log")
fig.update_yaxes(type="linear", secondary_y=True)

fig.update_layout(
    title_text="95th percentile forwarded vs received packets per second 95th percentile",
    title_x=0.5,
)
fig.show()
# Quantiles 0.0, 0.1, ..., 1.0 of every numeric column, per label.
# The numeric columns are selected explicitly because recent pandas versions
# raise a TypeError when a groupby quantile is asked to process string
# columns, instead of silently dropping them.
numeric_columns = list(df.select_dtypes(include="number").columns)
df_labels_quantiles = (
    df.groupby("label")[numeric_columns]
    .quantile(np.arange(0, 1.1, 0.1))
    .reset_index()
    # The quantile level of the result index has no name, so reset_index()
    # calls it "level_1"; give it a meaningful name.
    .rename(columns={"level_1": "quantile"})
)
df_labels_quantiles.head()
# Output of df_labels_quantiles.head():
# |   | label | quantile | index | src_port | dst_port | protocol | flow_duration | total_fwd_packets | total_backward_packets | total_length_fwd_packets | ... | min_seg_size_forward | active_mean | active_std | active_max | active_min | idle_mean | idle_std | idle_max | idle_min | inbound |
# |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
# | 0 | BENIGN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | ... | -1.408238e+09 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# | 1 | BENIGN | 0.1 | 296.0 | 443.0 | 53.0 | 6.0 | 2.0 | 1.0 | 0.0 | 0.0 | ... | 2.000000e+01 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# | 2 | BENIGN | 0.2 | 1240.4 | 443.0 | 53.0 | 6.0 | 127.0 | 2.0 | 0.0 | 0.0 | ... | 2.000000e+01 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# | 3 | BENIGN | 0.3 | 2888.0 | 50539.0 | 53.0 | 6.0 | 804.1 | 2.0 | 2.0 | 6.0 | ... | 2.000000e+01 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# | 4 | BENIGN | 0.4 | 6007.8 | 52709.0 | 80.0 | 6.0 | 20701.0 | 2.0 | 2.0 | 31.0 | ... | 2.000000e+01 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# 5 rows × 84 columns
# One line per label: the quantiles of total_fwd_packets plotted against the
# matching quantiles of flow_duration. Left as a bare expression, as in the
# original notebook cell.
px.line(
    data_frame=df_labels_quantiles,
    x="flow_duration",
    y="total_fwd_packets",
    color="label",
    line_dash="label",
    title="Number of sent packets based off of flow duration grouped by target variable",
    template="plotly_white",
)
# PCA
# Feature columns fed into the decomposition, in their original order.
pca_feature_columns = [
    "protocol",
    "flow_duration",
    "total_fwd_packets",
    "total_backward_packets",
    "total_length_fwd_packets",
    "total_length_bwd_packets",
    "fwd_packet_length_max",
    "fwd_packet_length_min",
    "fwd_packet_length_mean",
    "fwd_packet_length_std",
    "bwd_packet_length_max",
    "bwd_packet_length_min",
    "bwd_packet_length_mean",
    "bwd_packet_length_std",
    "flow_Bps",
    "flow_pps",
    "flow_iat_mean",
    "flow_iat_std",
    "flow_iat_max",
    "flow_iat_min",
    "flow_iat_total",
    "fwd_iat_mean",
    "fwd_iat_std",
    "fwd_iat_max",
    "fwd_iat_min",
    "bwd_iat_total",
    "bwd_iat_mean",
    "bwd_iat_std",
    "bwd_iat_max",
    "bwd_iat_min",
    "fwd_psh_flags",
    "bwd_psh_flags",
    "fwd_urg_flags",
    "bwd_urg_flags",
    "fwd_header_length",
    "bwd_header_length",
    "fwd_pps",
    "bwd_pps",
    "min_packet_length",
    "max_packet_length",
    "packet_length_mean",
    "packet_length_std",
    "packet_length_variance",
    "fin_flag_count",
    "syn_flag_count",
    "rst_flag_count",
    "psh_flag_count",
    "ack_flag_count",
    "urg_flag_count",
    "cwe_flag_count",
    "ece_flag_count",
    "down_up_ratio",
    "avg_packet_size",
    "avg_fwd_segment_size",
    "avg_bwd_segment_size",
    "fwd_header_length_1",
    "fwd_avg_bytes_bulk",
    "fwd_avg_packets_bulk",
    "fwd_avg_bulk_rate",
    "bwd_avg_bytes_bulk",
    "bwd_avg_packets_bulk",
    "bwd_avg_bulk_rate",
    "subflow_fwd_packets",
    "subflow_fwd_bytes",
    "subflow_bwd_packets",
    "subflow_bwd_bytes",
    "init_win_bytes_forward",
    "init_win_bytes_backward",
    "act_data_pkt_fwd",
    "min_seg_size_forward",
    "active_mean",
    "active_std",
    "active_max",
    "active_min",
    "idle_mean",
    "idle_std",
    "idle_max",
    "idle_min",
    "inbound",
]

# Standardize the features, then pass the result through
# preprocessing.normalize() before the decomposition.
scale = preprocessing.StandardScaler()
X = scale.fit_transform(df[pca_feature_columns])
X_norm = preprocessing.normalize(X)

# Project onto the first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X_norm)

# Attach the label to each projected row so the scatter can be colored by it.
df_principalComponents = pd.DataFrame(
    principalComponents, columns=["comp_1", "comp_2"]
)
df_principalComponents["label"] = df["label"]

px.scatter(
    data_frame=df_principalComponents,
    x="comp_1",
    y="comp_2",
    color="label",
    opacity=0.3,
    title="Principal Component Analysis based off of the relevant attributes",
    template="plotly_white",
)